Libraries
library(FactoMineR)
library(tidyr)
library(dplyr)
library(magrittr)
library(ggplot2)
library(ggpubr)
library(factoextra)
library(gridExtra)
Screw Caps Data
raw_data <- read.table("ScrewCaps.csv",header=TRUE, sep=",", dec=".", row.names=1)
summary(raw_data)
Supplier Diameter weight nb.of.pieces Shape Impermeability
Supplier A: 31 Min. :0.4458 Min. :0.610 Min. : 2.000 Shape 1:134 Type 1:172
Supplier B:150 1st Qu.:0.7785 1st Qu.:1.083 1st Qu.: 3.000 Shape 2: 45 Type 2: 23
Supplier C: 14 Median :1.0120 Median :1.400 Median : 4.000 Shape 3: 8
Mean :1.2843 Mean :1.701 Mean : 4.113 Shape 4: 8
3rd Qu.:1.2886 3rd Qu.:1.704 3rd Qu.: 5.000
Max. :5.3950 Max. :7.112 Max. :10.000
Finishing Mature.Volume Raw.Material Price Length
Hot Printing: 62 Min. : 1000 ABS: 21 Min. : 6.477 Min. : 3.369
Lacquering :133 1st Qu.: 15000 PP :148 1st Qu.:11.807 1st Qu.: 6.161
Median : 45000 PS : 26 Median :14.384 Median : 8.086
Mean : 96930 Mean :16.444 Mean :10.247
3rd Qu.:115000 3rd Qu.:18.902 3rd Qu.:10.340
Max. :800000 Max. :46.610 Max. :43.359
Univariate & Bivariate Descriptive Statistics
How is the distribution of the Price? Comment your plot with respect to the quartiles of the Price.
As illustrated in the plots below, the distribution of the price is bimodal.
Finish fixing this
price_density <- ggdensity(raw_data,x="Price",y = "..count..",
color="darkblue",
fill="lightblue",size=0.5,
alpha=0.2,
title = "Price Distribution",
linetype = "solid", add = c("median"),ylim=c(0,20),xlim=c(0,50)) +scale_x_continuous(limits=c(0,50))
price_boxplot <- ggboxplot(raw_data$Price,
width = 0.1,
fill ="lightgray",
outlier.colour = "darkblue", outlier.shape=4.2, ylim=c(0,50),
xlim=c(0,20)) + rotate()
price_boxplot_grob <- ggplotGrob(price_boxplot)
price_density + annotation_custom(grob = price_boxplot_grob, xmin= 0, xmax = 50, ymin= -4.5 , ymax=6)
ggarrange(price_density, price_boxplot, ncol = 2, nrow = 1)
Does the Price depend on the Length? weight?
ggplot(raw_data, aes(x=Length, y=Price)) + geom_point() + geom_smooth(method=lm, color="darkblue")+ theme_minimal()
ggplot(raw_data, aes(x=weight, y=Price)) + geom_point() + geom_smooth(method=lm,color="red")+theme_minimal()
Does the Price depend on the Impermeability? Shape?
The plots below suggests there is some dependence on Impermeability - the medians differ significantly
impermability_plot_1 <- ggdotplot(raw_data,x="Impermeability",y="Price",color = "Impermeability", palette = "jco",binwidth = 1,legend="none")
shape_plot_1 <- ggdotplot(raw_data,x="Shape",y="Price",color = "Shape", palette = "npg",binwidth = 1,legend="none")
impermability_plot_2 <- ggboxplot(raw_data,x="Impermeability",y="Price",color = "Impermeability", palette = "jco",legend="none")
shape_plot_2 <- ggboxplot(raw_data,x="Shape",y="Price",color = "Shape", palette = "npg", legend = "none")
ggarrange(ggarrange(impermability_plot_1,impermability_plot_2,ncol = 2, nrow = 1),
ggarrange(shape_plot_1,shape_plot_2,ncol = 2, nrow = 1),
ncol = 1, nrow = 2)
Which is the less expensive Supplier?
#Do this questions
One important point in explanatory data analysis consists in identifying potential outliers. Could you give points which are suspect regarding the Mature.Volume variable? Give the characteristics (other features) of the observations that seem suspsect
Mature.Volume_plot_2 <- gghistogram(raw_data,x="Mature.Volume",y="..count..")
Using `bins = 30` by default. Pick better value with the argument `bins`.
Mature.Volume_plot_2
We decide to remove the outliers:
library(dplyr)
raw_data <- raw_data %>% filter ( Mature.Volume < 600000 )
Perform a PCA on the dataset ScrewCap, explain briefly what are the aims of a PCA and how categorical variables are handled?
This dataset 1 quantitative variable (Price) and 5 qualitative variables (Supplier, Shape, Impermeability and Finishing) which are considered as illustrative.
res.pca <- PCA(raw_data,quali.sup = c(1,5,6,7,9),quanti.sup = 10,axes = c(1,2))
fviz_pca_ind(res.pca, col.ind="contrib") + scale_color_gradient2(low="blue", mid="white", high="red", midpoint=4) + theme_minimal()
Compute the correlation matrix between the variables and comment it with respect to the correlation circle
We first center and standardize the variables:
#virer les outliers
don <- as.matrix(raw_data[,-c(1,5,6,7,9,10)]) %>% scale()
don_correlation <- cor(don)
On what kind of relationship PCA focuses? Is it a problem?
Comment the PCA outputs. • Comment the position of the categories Impermeability=type 2 and Raw.Material=PS. • Comment the percentage of inertia
barplot(res.pca$eig[,2], names.arg = 1:nrow(res.pca$eig))
drawn <- c("90", "89", "164", "161", "163", "131")
plot.PCA(res.pca, select = drawn, axes = 1:2, choix = 'ind', invisible = 'quali', title = '')
wilks.p <- structure(c(1.39936324848005e-27, 1.02072912984402e-22, 2.01088132732067e-11,
3.37174568997818e-07, 0.394546176405119), .Names = c("Impermeability", "Raw.Material", "Shape", "Supplier", "Finishing"))
wilks.p
Impermeability Raw.Material Shape Supplier Finishing
1.399363e-27 1.020729e-22 2.010881e-11 3.371746e-07 3.945462e-01
sample = sample(rownames(res.pca$call$X), length(rownames(res.pca$call$X)))
res.pca$call$X = res.pca$call$X[sample,]
res.pca$ind$coord = res.pca$ind$coord[sample[!sample %in% rownames(res.pca$ind.sup$coord)],]
res.pca$ind.sup$coord = res.pca$ind.sup$coord[sample[sample %in% rownames(res.pca$ind.sup$coord)],]
drawn <-c("90", "89", "164", "161", "163", "131")
hab <-"Impermeability"
plotellipses(res.pca, axes = 1:2, invisible = 'quali', select = drawn, keepvar = hab, title = '')
NULL
drawn <-c("Length", "Diameter", "weight", "nb.of.pieces", "Price")
plot.PCA(res.pca, select = drawn, axes = 1:2, choix = 'var', title = '')
drawn <- c("Type 1", "Type 2", "Supplier A", "PS", "PP", "Shape 2", "Shape 3", "Shape 1", "Lacquering", "Supplier B", "Supplier C","Hot Printing")
plot.PCA(res.pca, select = drawn, axes = 1:2, choix = 'ind', invisible = c('ind', 'ind.sup'), title = '')
the condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be used
res.pca <- PCA(raw_data,quali.sup = c(1,5,6,7,9),quanti.sup = 10, ncp=3) #ncp = 3
res.hcpc <- HCPC(res.pca, nb.clust = -1
, graph = FALSE)
Chi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrect
drawn <- c("90", "89", "164", "161", "163", "131")
plot.HCPC(res.hcpc, choice = 'map', draw.tree = FALSE, select = drawn, title = '')
dimdesc(res.pca, axes = 1:1)
$Dim.1
$Dim.1$quanti
correlation p.value
Length 0.9853764 3.259183e-147
Diameter 0.9851090 1.784008e-146
weight 0.9774643 1.263294e-129
Price 0.7960132 4.472456e-43
nb.of.pieces -0.2017085 5.139018e-03
Mature.Volume -0.4118157 3.243173e-09
$Dim.1$quali
R2 p.value
Impermeability 0.4767041 2.203784e-28
Raw.Material 0.4309747 9.602186e-24
Shape 0.2024825 3.268025e-09
$Dim.1$category
Estimate p.value
Type 2 1.8697709 2.203784e-28
PS 1.6944602 3.078822e-20
Shape 2 1.4547681 6.874053e-11
ABS -0.1039202 1.566216e-02
Shape 1 -0.3981492 5.692581e-07
PP -1.5905400 1.465743e-20
Type 1 -1.8697709 2.203784e-28
res.hcpc$desc.var
$test.chi2
p.value df
Impermeability 5.318642e-18 2
Raw.Material 5.226547e-17 4
Shape 5.626207e-06 6
Supplier 4.102258e-02 4
$category
$category$`1`
Cla/Mod Mod/Cla Global p.value v.test
Raw.Material=PP 25.000000 97.297297 75.392670 0.0001368886 3.813724
Impermeability=Type 1 22.023810 100.000000 87.958115 0.0049829303 2.808135
Supplier=Supplier C 0.000000 0.000000 7.329843 0.0434829294 -2.019041
Raw.Material=PS 3.846154 2.702703 13.612565 0.0222292828 -2.286427
Raw.Material=ABS 0.000000 0.000000 10.994764 0.0081544536 -2.645607
Impermeability=Type 2 0.000000 0.000000 12.041885 0.0049829303 -2.808135
Shape=Shape 2 2.222222 2.702703 23.560209 0.0002330416 -3.680210
$category$`2`
Cla/Mod Mod/Cla Global p.value v.test
Impermeability=Type 1 74.40476 93.984962 87.958115 0.0002868332 3.626910
Supplier=Supplier C 100.00000 10.526316 7.329843 0.0050545222 2.803538
Raw.Material=PP 74.30556 80.451128 75.392670 0.0173789787 2.378590
Raw.Material=PS 38.46154 7.518797 13.612565 0.0004703729 -3.497084
Impermeability=Type 2 34.78261 6.015038 12.041885 0.0002868332 -3.626910
$category$`3`
Cla/Mod Mod/Cla Global p.value v.test
Impermeability=Type 2 65.2173913 71.428571 12.04188 2.909966e-12 6.982005
Raw.Material=PS 57.6923077 71.428571 13.61257 4.169068e-11 6.597941
Shape=Shape 2 31.1111111 66.666667 23.56021 9.932485e-06 4.418638
Shape=Shape 1 5.3846154 33.333333 68.06283 6.715709e-04 -3.400930
Impermeability=Type 1 3.5714286 28.571429 87.95812 2.909966e-12 -6.982005
Raw.Material=PP 0.6944444 4.761905 75.39267 2.869539e-13 -7.300381
$quanti.var
Eta2 P-value
Length 0.8036361 3.530117e-67
Diameter 0.8025769 5.853470e-67
weight 0.8013378 1.053993e-66
Mature.Volume 0.7588382 8.649758e-59
Price 0.4812030 1.620918e-27
nb.of.pieces 0.1760507 1.243497e-08
$quanti
$quanti$`1`
v.test Mean in category Overall mean sd in category Overall sd p.value
Mature.Volume 11.942982 2.431183e+05 82206.026178 67166.762125 9.103190e+04 7.064414e-33
Diameter -3.255425 8.214269e-01 1.294639 0.254233 9.821218e-01 1.132228e-03
Length -3.297003 6.491733e+00 10.329589 2.056760 7.864783e+00 9.772253e-04
weight -3.536244 1.100262e+00 1.714121 0.315574 1.172854e+00 4.058595e-04
nb.of.pieces -3.780986 3.324324e+00 4.115183 1.274576 1.413225e+00 1.562083e-04
Price -3.857939 1.245686e+01 16.552332 4.115901 7.172431e+00 1.143473e-04
$quanti$`2`
v.test Mean in category Overall mean sd in category Overall sd p.value
nb.of.pieces 5.739229 4.503759 4.115183 1.352381e+00 1.413225e+00 9.510856e-09
Price -2.999298 15.521715 16.552332 4.620374e+00 7.172431e+00 2.706026e-03
weight -5.297059 1.416482 1.714121 3.882302e-01 1.172854e+00 1.176825e-07
Length -5.533133 8.244766 10.329589 2.492726e+00 7.864783e+00 3.145605e-08
Diameter -5.565997 1.032748 1.294639 3.121379e-01 9.821218e-01 2.606582e-08
Mature.Volume -8.031930 47177.255639 82206.026178 3.971314e+04 9.103190e+04 9.595124e-16
$quanti$`3`
v.test Mean in category Overall mean sd in category Overall sd p.value
Length 12.298789 30.295407 10.329589 7.979008e+00 7.864783e+00 9.194180e-35
Diameter 12.294570 3.787033 1.294639 1.000521e+00 9.821218e-01 9.687099e-35
weight 12.254018 4.680731 1.714121 1.164251e+00 1.172854e+00 1.598746e-34
Price 9.282815 30.295414 16.552332 8.814239e+00 7.172431e+00 1.650583e-20
Mature.Volume -3.281665 20542.857143 82206.026178 1.547128e+04 9.103190e+04 1.031962e-03
nb.of.pieces -3.659694 3.047619 4.115183 7.221786e-01 1.413225e+00 2.525166e-04
attr(,"class")
[1] "catdes" "list "
Fisher test Variance
fviz_nbclust(don, kmeans, method = "wss") + geom_vline(xintercept = 3, linetype = 2)
Comments the results and describe precisely one cluster – Add Fisher Test
The cluster 1 is made of individuals sharing : - high values for the variable Mature.Volume. - low values for the variables nb.of.pieces, Price, weight, Length and Diameter (variables are sorted from the weakest).
The cluster 2 is made of individuals sharing : - high values for the variable nb.of.pieces. - low values for the variables Mature.Volume, Diameter, Length, weight and Price (variables are sorted from the weakest).
The cluster 3 is made of individuals such as 89, 90, 131, 161, 163 and 164. This group is characterized by : - high values for the variables Length, Diameter, weight and Price (variables are sorted from the strongest). - low values for the variables nb.of.pieces and Mature.Volume (variables are sorted from the weakest).
If someone ask you why you have selected k components and not k + 1 or k − 1, what is your answer? (could you suggest a strategy to assess the stability of the approach? - are there many differences between the clustering obtained on k components or on the initial data)
res.pca <- PCA(raw_data,quali.sup = c(1,5,6,7,9),quanti.sup = 10,ncp=4)
res.hcpc <- HCPC(res.pca, nb.clust = -1)
res.pca <- PCA(raw_data,quali.sup = c(1,5,6,7,9),quanti.sup = 10,ncp=3)
res.hcpc <- HCPC(res.pca, nb.clust = -1)
res.pca <- PCA(raw_data,quali.sup = c(1,5,6,7,9),quanti.sup = 10,ncp=2)
res.hcpc <- HCPC(res.pca, nb.clust = -1)
res.hcpc <- HCPC(res.pca, nb.clust = -1, graph = FALSE)
Chi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrect
res.hcpc <- HCPC(res.pca, nb.clust = -1, graph = FALSE)
Chi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrect
res.hcpc
**Results for the Hierarchical Clustering on Principal Components**
name description
1 "$data.clust" "dataset with the cluster of the individuals"
2 "$desc.var" "description of the clusters by the variables"
3 "$desc.var$quanti.var" "description of the cluster var. by the continuous var."
4 "$desc.var$quanti" "description of the clusters by the continuous var."
5 "$desc.var$test.chi2" "description of the cluster var. by the categorical var."
6 "$desc.axes$category" "description of the clusters by the categories."
7 "$desc.axes" "description of the clusters by the dimensions"
8 "$desc.axes$quanti.var" "description of the cluster var. by the axes"
9 "$desc.axes$quanti" "description of the clusters by the axes"
10 "$desc.ind" "description of the clusters by the individuals"
11 "$desc.ind$para" "parangons of each clusters"
12 "$desc.ind$dist" "specific individuals"
13 "$call" "summary statistics"
14 "$call$t" "description of the tree"
plot.HCPC(res.hcpc, choice = 'map', draw.tree = FALSE, title = '', select=c("12"))
res.pca <- PCA(raw_data,quali.sup = c(1,5,6,7,9),quanti.sup = 10,ncp=3)
res.hcpc <- HCPC(res.pca, nb.clust = -1, graph = FALSE)
Chi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrect
res.hcpc
**Results for the Hierarchical Clustering on Principal Components**
name description
1 "$data.clust" "dataset with the cluster of the individuals"
2 "$desc.var" "description of the clusters by the variables"
3 "$desc.var$quanti.var" "description of the cluster var. by the continuous var."
4 "$desc.var$quanti" "description of the clusters by the continuous var."
5 "$desc.var$test.chi2" "description of the cluster var. by the categorical var."
6 "$desc.axes$category" "description of the clusters by the categories."
7 "$desc.axes" "description of the clusters by the dimensions"
8 "$desc.axes$quanti.var" "description of the cluster var. by the axes"
9 "$desc.axes$quanti" "description of the clusters by the axes"
10 "$desc.ind" "description of the clusters by the individuals"
11 "$desc.ind$para" "parangons of each clusters"
12 "$desc.ind$dist" "specific individuals"
13 "$call" "summary statistics"
14 "$call$t" "description of the tree"
Characterization of each supplier
catdes(raw_data, num.var=6)
Chi-squared approximation may be incorrectChi-squared approximation may be incorrectChi-squared approximation may be incorrect
$test.chi2
p.value df
Raw.Material 4.088669e-21 2
Shape 2.873602e-16 3
Supplier 1.088731e-02 2
$category
$category$`Type 1`
Cla/Mod Mod/Cla Global p.value v.test
Shape=Shape 1 99.23077 76.785714 68.06283 1.043033e-11 6.800436
Raw.Material=PP 97.91667 83.928571 75.39267 1.773212e-11 6.723573
Supplier=Supplier A 72.41379 12.500000 15.18325 1.301492e-02 -2.483361
Raw.Material=PS 30.76923 4.761905 13.61257 5.429478e-15 -7.816541
Shape=Shape 2 51.11111 13.690476 23.56021 2.151940e-15 -7.932260
$category$`Type 2`
Cla/Mod Mod/Cla Global p.value v.test
Shape=Shape 2 48.8888889 95.652174 23.56021 2.151940e-15 7.932260
Raw.Material=PS 69.2307692 78.260870 13.61257 5.429478e-15 7.816541
Supplier=Supplier A 27.5862069 34.782609 15.18325 1.301492e-02 2.483361
Raw.Material=PP 2.0833333 13.043478 75.39267 1.773212e-11 -6.723573
Shape=Shape 1 0.7692308 4.347826 68.06283 1.043033e-11 -6.800436
$quanti.var
Eta2 P-value
Diameter 0.47062626 6.604215e-28
Length 0.46804072 1.049429e-27
weight 0.45675032 7.728264e-27
Price 0.43301606 4.512224e-25
Mature.Volume 0.07171395 1.801495e-04
$quanti
$quanti$`Type 1`
v.test Mean in category Overall mean sd in category Overall sd p.value
Mature.Volume 3.691294 91225.988095 82206.026178 9.338486e+04 9.103190e+04 2.231162e-04
Price -9.070449 14.805996 16.552332 4.819967e+00 7.172431e+00 1.185272e-19
weight -9.315716 1.420835 1.714121 6.707159e-01 1.172854e+00 1.211330e-20
Length -9.430150 8.338742 10.329589 4.357114e+00 7.864783e+00 4.095012e-21
Diameter -9.456161 1.045344 1.294639 5.411724e-01 9.821218e-01 3.194554e-21
$quanti$`Type 2`
v.test Mean in category Overall mean sd in category Overall sd p.value
Diameter 9.456161 3.115573 1.294639 1.449522 9.821218e-01 3.194554e-21
Length 9.430150 24.871426 10.329589 11.600832 7.864783e+00 4.095012e-21
weight 9.315716 3.856391 1.714121 1.708742 1.172854e+00 1.211330e-20
Price 9.070449 29.308174 16.552332 8.516118 7.172431e+00 1.185272e-19
Mature.Volume -3.691294 16321.086957 82206.026178 13496.587327 9.103190e+04 2.231162e-04
attr(,"class")
[1] "catdes" "list "
res.famd <- FAMD (raw_data, ncp = 5, graph = TRUE, sup.var = c(1,5,7,10), axes = c(1,2), row.w = NULL, tab.comp = NULL)
res.hcpc.famd <- HCPC(res.famd, nb.clust = -1
, graph = TRUE)
plot.HCPC(res.hcpc.famd, choice = 'map', draw.tree = FALSE, select = c(1,10), title = '')
P= c(Supplier = "Supplier A", Diameter=3.7800000 , weight = 3.780000 , nb.of.pieces = 2 , Shape = "Shape 1" , Impermeability = "Type 1", Finishing="Lacquering", Mature.Volume = 60000, Raw.Material = "PS", Length = "30.118312")
Error: unexpected string constant in:
"Length
P= c(Supplier = ""